* Project root directory is taken from the first positional argument
global root_dir = "`1'"

* Load the project configuration
* NOTE(review): presumably this defines log_dir, commondata_dir and dataset_dir - confirm
include "$root_dir/code/config/config.do"


* Open a named log for this script; the path is quoted so that a log
* directory containing spaces does not break the command
cap noi log using "${log_dir}/ipc_cpc_codes.log", replace name(dat)

* Positional arguments 2-5 are copied into the globals arg1-arg4.
* The placeholder ___EMPTY___ stands for an argument that was left blank
* and is converted to an empty string.
forvalues k = 1/4 {
    local pos = `k' + 1
    global arg`k' = cond("``pos''" == "___EMPTY___", "", "``pos''")
}

* Each non-empty argument overrides the corresponding weighting global;
* an empty argument leaves the existing value of that global untouched.
if "${arg1}" != "" {
    global weight_category "${arg1}"
    display "Weight category: ${weight_category}"
}

if "${arg2}" != "" {
    global weight_versions "${arg2}"
    display "Weight versions: ${weight_versions}"
}

if "${arg3}" != "" {
    global weight_window "${arg3}"
    display "Weight window: ${weight_window}"
}

if "${arg4}" != "" {
    global wtype "${arg4}"
}
* Always echo the weight type currently in effect (may be blank)
display "${wtype}"
capture noi {

/* This do-file merges CPC and IPC classification codes, creating a dataset of all
patent applications (appln_id) - technology codes ("c/ipc") combinations.

Steps:
  1. stack the IPC and CPC code files and truncate every code to the part before the slash
  2. build a CPC-to-IPC mapping from the official concordance file
  3. apply a rule of thumb to 2000-series codes that the concordance does not cover
  4. apply the mapping to the application-level data and save the result

All file names are quoted so that data or temp directories containing spaces work. */

tempfile cipc_appln cipc_list cipc_concordance cipc_mapping

* Put together ipc and cpc code mappings, aggregate to cipc6 (cipc4) level
use "${commondata_dir}/patstat_2018b/ipc_codes.dta", clear
ren ipc_code cpc_code
append using "${commondata_dir}/patstat_2018b/CPC_codes.dta"
duplicates drop
gen slashpos = strpos(cpc_code,"/")

* When the code has no slash (slashpos is 0) substr() returns an empty string;
* those codes are treated as 4-character codes by the replace below
gen cipc6 = substr(cpc_code,1,slashpos-1)
drop slashpos
replace cipc6 = substr(cpc_code,1,4) if cipc6 == ""
keep appln_id cipc6
duplicates drop
save "`cipc_appln'"

* Extract list of cipc6 codes
keep cipc6
duplicates drop
save "`cipc_list'"

* Prepare concordance mapping
import delim "${commondata_dir}/cpc_ipc_concordance/cpc_ipc_concordance.txt", varnames(1) clear
* downloaded from https://www.cooperativepatentclassification.org/cpc/concordances/cpc-ipc-concordance.txt
* on 2023 march 12
* February 2023 version (2023.02)
gen slashpos_cpc = strpos(cpcgroup,"/")
gen cpc6 = substr(cpcgroup,1,slashpos_cpc-1)
gen slashpos_ipc = strpos(ipcgroup,"/")
gen ipc6 = substr(ipcgroup,1,slashpos_ipc-1)
* Keep one row per cpc6: the first observation in file order wins, even when
* several rows with the same cpc6 point to different ipc6 values
duplicates drop cpc6, force
rename cpc6 cipc6
keep cipc6 ipc6
save "`cipc_concordance'"

* Apply concordance to list of cipc6 codes, apply best possible rule of thumb for the 6% of codes not in the concordance
* i.e., for any 8-character code whose 5th character is 2 (a 2000-series code): keep the 4-character
* prefix, drop characters 5-6, strip leading zeros from the remainder, and use the result when it
* coincides with at least one other code in the list
use "`cipc_list'", clear
mmerge cipc6 using "`cipc_concordance'", unmatched(master)
gen cipc6_mod = cipc6
replace cipc6_mod = ipc6 if ipc6 != ""
gen cipc6_handmod = cipc6
replace cipc6_handmod = substr(cipc6,1,4) + ustrregexrf(substr(cipc6,7,.),"^0+","") if strlen(cipc6) == 8 & substr(cipc6,5,1)=="2"
* num_codes is the number of listed codes sharing the same hand-modified code
bys cipc6_handmod: egen num_codes = count(cipc6_handmod)
* Only codes that found no concordance match (_merge equal to 1) get the hand-modified value
replace cipc6_mod = cipc6_handmod if num_codes >= 2 & _merge == 1 & strlen(cipc6) == 8 & substr(cipc6,5,1)=="2"
keep cipc6 cipc6_mod
save "`cipc_mapping'"

* Apply map to data, save
* clear is given explicitly so this step does not depend on the data in memory being unchanged
use "`cipc_appln'", clear
mmerge cipc6 using "`cipc_mapping'", unmatched(master)
replace cipc6 = cipc6_mod
keep appln_id cipc6
compress
save "${dataset_dir}/patstat_orbis/cipc_codes.dta", replace

}
* Report whether the captured block above completed without error
* (_rc holds the return code left by the capture prefix)
if _rc != 0 {
    display "Execution finished with errors."
}
else {
    display "Execution finished successfully."
}

* Close the named log; capture keeps this silent if the log is not open
capture log close dat